`— title: “STA 222 Final Project” output: html_document date: “2022-12-03” —

Loading Libraries

library(survival)
library(KMsurv)
library(ggplot2)
library(ggpubr)
library(survminer)
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(muhaz)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
library(tidyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
library(gridExtra)

Cleaning Data

# Loading in data
data(std)
std0 = std
# Checking for NA Values in data set 
any(is.na(std))
## [1] FALSE
factor.cols <-
  c(
    "race",
    # Race (W=white, B=black)
    "marital",
    # Marital status (D=divorced / separated, M=married, S=single)
    "iinfct",
    # Initial infection (1= gonorrhea, 2=chlamydia, 3=both)
    "os12m",
    # Oral sex within 12 months (1=yes, 0=no)
    "os30d",
    # Oral sex within 30 days (1=yes, 0=no)
    "rs12m",
    # Rectal sex within 12 months (1=yes, 0=no)
    "rs30d",
    # Rectal sex within 30 days (1=yes, 0=no)
    "abdpain",
    # Presence of abdominal pain (1=yes, 0=no)
    "discharge",
    # Sign of discharge (1=yes, 0=no)
    "dysuria",
    # Sign of dysuria (1=yes, 0=no)
    "condom",
    # Condom use (1=always, 2=sometime, 3=never)
    "itch",
    # Sign of itch (1=yes, 0=no)
    "lesion",
    # Sign of lesion (1=yes, 0=no)
    "rash",
    # Sign of rash (1=yes, 0=no)
    "lymph",
    # Sign of lymph (1=yes, 0=no)
    "vagina",
    # Involvement vagina at exam (1=yes, 0=no)
    "dchexam",
    # Discharge at exam (1=yes, 0=no)
    "abnode" # Abnormal node at exam (1=yes, 0=no)
  )
std[factor.cols] <- lapply(std[factor.cols], as.factor)

std$race <-
  mapvalues(std$race,
            from = c("W", "B"),
            to = c("White", "Black"))
std$marital <-
  mapvalues(
    std$marital,
    from = c("D", "M", "S"),
    to = c("Divorced/Separated", "Married", "Single")
  )
std$iinfct <-
  mapvalues(
    std$iinfct,
    from = c("1", "2", "3"),
    to = c("gonorrhea", "chlamydia", "both")
  )
std$condom <-
  mapvalues(
    std$condom,
    from = c("1", "2", "3"),
    to = c("always", "sometime", "never")
  )

Exploratory Data Analysis

Checking Model Assumptions

# Building the Survival Object 
infect <- std$iinfct
surv_object <- Surv(time = std$time, event = std$rinfct)
## Call:
## survdiff(formula = surv_object ~ iinfct, data = std)
## 
##                    N Observed Expected (O-E)^2/E (O-E)^2/V
## iinfct=gonorrhea 140       73     54.5   6.28042   7.50617
## iinfct=chlamydia 396      135    153.0   2.12201   3.81136
## iinfct=both      341      139    139.5   0.00166   0.00278
## 
##  Chisq= 8.5  on 2 degrees of freedom, p= 0.01

Cumulative Hazards and CLogLog

## Call:
## coxph(formula = surv_object ~ iinfct, data = std)
## 
##   n= 877, number of events= 347 
## 
##                    coef exp(coef) se(coef)      z Pr(>|z|)   
## iinfctchlamydia -0.4202    0.6569   0.1457 -2.884  0.00393 **
## iinfctboth      -0.2980    0.7423   0.1450 -2.055  0.03984 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6569      1.522    0.4937    0.8741
## iinfctboth         0.7423      1.347    0.5587    0.9863
## 
## Concordance= 0.524  (se = 0.016 )
## Likelihood ratio test= 7.93  on 2 df,   p=0.02
## Wald test            = 8.37  on 2 df,   p=0.02
## Score (logrank) test = 8.46  on 2 df,   p=0.01

Building the Model

cox1 <-
  coxph(
    surv_object ~ iinfct + marital + race + os12m + os30d +
      rs12m + rs30d + abdpain + discharge + dysuria + condom + 
      itch + lesion + rash + lymph + vagina + dchexam + abnode +
      age + yschool + npartner,
    data = std
  )
summary(cox1)
## Call:
## coxph(formula = surv_object ~ iinfct + marital + race + os12m + 
##     os30d + rs12m + rs30d + abdpain + discharge + dysuria + condom + 
##     itch + lesion + rash + lymph + vagina + dchexam + abnode + 
##     age + yschool + npartner, data = std)
## 
##   n= 877, number of events= 347 
## 
##                      coef exp(coef)  se(coef)      z Pr(>|z|)   
## iinfctchlamydia -0.334628  0.715604  0.149647 -2.236  0.02534 * 
## iinfctboth      -0.267515  0.765279  0.149987 -1.784  0.07449 . 
## maritalMarried   0.055058  1.056602  0.431303  0.128  0.89842   
## maritalSingle    0.408097  1.503953  0.295341  1.382  0.16704   
## raceWhite       -0.111462  0.894526  0.141327 -0.789  0.43030   
## os12m1          -0.206151  0.813711  0.212028 -0.972  0.33091   
## os30d1          -0.339983  0.711783  0.238657 -1.425  0.15428   
## rs12m1           0.033955  1.034538  0.445166  0.076  0.93920   
## rs30d1          -0.194771  0.823023  0.565199 -0.345  0.73039   
## abdpain1         0.229308  1.257729  0.156236  1.468  0.14219   
## discharge1       0.114691  1.121527  0.114283  1.004  0.31559   
## dysuria1         0.164089  1.178320  0.155157  1.058  0.29025   
## condomsometime  -0.064082  0.937928  0.239642 -0.267  0.78916   
## condomnever     -0.321968  0.724721  0.247790 -1.299  0.19382   
## itch1           -0.147451  0.862905  0.154774 -0.953  0.34075   
## lesion1         -0.183670  0.832210  0.333523 -0.551  0.58184   
## rash1            0.008298  1.008333  0.392928  0.021  0.98315   
## lymph1          -0.030840  0.969631  0.547496 -0.056  0.95508   
## vagina1          0.351398  1.421053  0.174868  2.010  0.04448 * 
## dchexam1        -0.463338  0.629180  0.230020 -2.014  0.04397 * 
## abnode1          0.170712  1.186149  0.433788  0.394  0.69392   
## age              0.008096  1.008129  0.013913  0.582  0.56066   
## yschool         -0.128072  0.879790  0.039366 -3.253  0.00114 **
## npartner         0.076670  1.079686  0.053888  1.423  0.15480   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.7156     1.3974    0.5337    0.9595
## iinfctboth         0.7653     1.3067    0.5704    1.0268
## maritalMarried     1.0566     0.9464    0.4537    2.4606
## maritalSingle      1.5040     0.6649    0.8430    2.6830
## raceWhite          0.8945     1.1179    0.6781    1.1800
## os12m1             0.8137     1.2289    0.5370    1.2330
## os30d1             0.7118     1.4049    0.4459    1.1363
## rs12m1             1.0345     0.9666    0.4323    2.4756
## rs30d1             0.8230     1.2150    0.2718    2.4918
## abdpain1           1.2577     0.7951    0.9260    1.7083
## discharge1         1.1215     0.8916    0.8965    1.4031
## dysuria1           1.1783     0.8487    0.8693    1.5971
## condomsometime     0.9379     1.0662    0.5864    1.5002
## condomnever        0.7247     1.3798    0.4459    1.1778
## itch1              0.8629     1.1589    0.6371    1.1687
## lesion1            0.8322     1.2016    0.4328    1.6000
## rash1              1.0083     0.9917    0.4668    2.1780
## lymph1             0.9696     1.0313    0.3316    2.8355
## vagina1            1.4211     0.7037    1.0087    2.0020
## dchexam1           0.6292     1.5894    0.4008    0.9876
## abnode1            1.1861     0.8431    0.5069    2.7758
## age                1.0081     0.9919    0.9810    1.0360
## yschool            0.8798     1.1366    0.8145    0.9504
## npartner           1.0797     0.9262    0.9715    1.2000
## 
## Concordance= 0.635  (se = 0.016 )
## Likelihood ratio test= 73.37  on 24 df,   p=7e-07
## Wald test            = 69.89  on 24 df,   p=2e-06
## Score (logrank) test = 71.71  on 24 df,   p=1e-06
## Single term deletions
## 
## Model:
## surv_object ~ iinfct + marital + race + os12m + os30d + rs12m + 
##     rs30d + abdpain + discharge + dysuria + condom + itch + lesion + 
##     rash + lymph + vagina + dchexam + abnode + age + yschool + 
##     npartner
##           Df    AIC     LRT Pr(>Chi)   
## <none>       4121.2                    
## iinfct     2 4122.2  4.9877 0.082590 . 
## marital    2 4119.8  2.6535 0.265345   
## race       1 4119.8  0.6300 0.427371   
## os12m      1 4120.2  0.9869 0.320512   
## os30d      1 4121.1  1.9585 0.161675   
## rs12m      1 4119.2  0.0058 0.939450   
## rs30d      1 4119.3  0.1175 0.731758   
## abdpain    1 4121.2  2.0656 0.150654   
## discharge  1 4120.2  1.0055 0.315972   
## dysuria    1 4120.2  1.0820 0.298256   
## condom     2 4122.2  5.0381 0.080535 . 
## itch       1 4120.1  0.9324 0.334250   
## lesion     1 4119.5  0.3156 0.574248   
## rash       1 4119.2  0.0004 0.983170   
## lymph      1 4119.2  0.0032 0.954914   
## vagina     1 4122.9  3.7409 0.053095 . 
## dchexam    1 4122.8  3.6128 0.057335 . 
## abnode     1 4119.3  0.1482 0.700309   
## age        1 4119.5  0.3321 0.564436   
## yschool    1 4129.7 10.5087 0.001188 **
## npartner   1 4121.0  1.8274 0.176434   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cox.model = coxph(surv_object ~ iinfct+condom+vagina
                  +dchexam+yschool, data = std)
summary(cox.model)
## Call:
## coxph(formula = surv_object ~ iinfct + condom + vagina + dchexam + 
##     yschool, data = std)
## 
##   n= 877, number of events= 347 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## iinfctchlamydia -0.38468   0.68067  0.14618 -2.632  0.00850 ** 
## iinfctboth      -0.24681   0.78129  0.14561 -1.695  0.09006 .  
## condomnever     -0.29779   0.74245  0.11551 -2.578  0.00994 ** 
## vagina1          0.40660   1.50171  0.16739  2.429  0.01514 *  
## dchexam1        -0.37042   0.69044  0.22123 -1.674  0.09405 .  
## yschool         -0.14357   0.86626  0.03332 -4.308 1.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6807     1.4692    0.5111    0.9065
## iinfctboth         0.7813     1.2799    0.5873    1.0393
## condomnever        0.7425     1.3469    0.5920    0.9311
## vagina1            1.5017     0.6659    1.0817    2.0848
## dchexam1           0.6904     1.4483    0.4475    1.0652
## yschool            0.8663     1.1544    0.8115    0.9247
## 
## Concordance= 0.603  (se = 0.017 )
## Likelihood ratio test= 45.32  on 6 df,   p=4e-08
## Wald test            = 46.51  on 6 df,   p=2e-08
## Score (logrank) test = 46.77  on 6 df,   p=2e-08

Checking the Model

cox.modelNoSchool <- coxph(surv_object ~ iinfct+condom+vagina+os30d+abdpain, data = std)
martNoSchool <- residuals(cox.modelNoSchool, type = "martingale")

# We plot martingale residuals to see if transformation is appropriate 
lowessOBJ <- as.data.frame(lowess(std$yschool, martNoSchool))

ggplotly(
  ggplot() + aes(std$yschool, martNoSchool) + geom_point() + 
    labs(x = "Years of School", y = "Martingale Residuals", title = "Martingale Residuals vs. Years of School") + 
    geom_line(data = lowessOBJ, aes(x = x, y = y), col = "#6495ed")
)
##          chisq df    p
## iinfct  3.5184  2 0.17
## condom  1.0539  1 0.30
## vagina  1.2835  1 0.26
## dchexam 0.4085  1 0.52
## yschool 0.0214  1 0.88
## GLOBAL  6.1176  6 0.41

Outliers

##     obs  race            marital age yschool    iinfct npartner os12m os30d
## 11   11 Black Divorced/Separated  32      12      both        6     1     1
## 366 366 White             Single  14       9 gonorrhea        1     0     0
## 525 525 Black             Single  15       8      both        1     1     1
## 831 831 White             Single  20      12 chlamydia       10     1     1
##     rs12m rs30d abdpain discharge dysuria condom itch lesion rash lymph vagina
## 11      0     0       1         1       0    use    0      0    0     0      1
## 366     0     0       0         0       1    use    0      0    0     0      0
## 525     0     0       0         1       0  never    0      0    0     0      1
## 831     1     1       0         1       0    use    1      0    0     0      1
##     dchexam abnode rinfct time
## 11        1      0      0 1468
## 366       1      0      0 1439
## 525       1      0      0 1005
## 831       0      0      0 1027